import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB  # 特征：词的次数
import joblib

# Load the labelled comment dataset.
# NOTE(review): encoding='ANSI' resolves to the local Windows code page and
# raises LookupError on other platforms; the file is presumably GBK-encoded —
# confirm and prefer an explicit codec such as 'gbk'.
data = pd.read_csv("../ML_data/数据评论_测试.csv",
                   encoding='ANSI')

# Hold out the last 3 rows as the test sample. .copy() makes `test` an
# independent frame so the in-place segmentation assignment later in the
# script does not hit pandas' chained-assignment ambiguity
# (SettingWithCopyWarning; an error under pandas 2.x copy-on-write).
test = data.tail(3).copy()
test_org = test.copy()  # untouched copy, kept for the final Excel export


# 1. 文本内容进行分词
def split_text(val):
    """Segment one Chinese text string into space-separated tokens.

    Parameters
    ----------
    val : str
        Raw comment text.

    Returns
    -------
    str
        jieba tokens joined with single spaces — the whitespace-delimited
        format CountVectorizer expects.
    """
    # jieba.cut yields a generator; str.join consumes it directly, so the
    # intermediate list() in the original was unnecessary.
    return " ".join(jieba.cut(val))


# Apply word segmentation to every comment in the content column.
test["内容 "] = test["内容 "].apply(split_text)


# 2. Word-frequency vectorization using the previously fitted vectorizer.
vectorizer = joblib.load("词频向量化.model")
# print(vectorizer.get_feature_names())
X_test = vectorizer.transform(test["内容 "]).toarray()

print("测试集特征\n", X_test)


# 3. Load the trained Naive Bayes classifier and predict on the test features.
classifier = joblib.load("数据分类_贝叶斯.model")
y_pred = classifier.predict(X_test)
print("预测结果\n", y_pred)
# print("ground truth\n", test["评价"].tolist())
# print("class probabilities\n", classifier.predict_proba(X_test))

# Attach predictions to the untouched original rows and export to Excel.
test_org["评价"] = y_pred
test_org.to_excel("预测后结果.xlsx")